I have been looking around for a good tutorial to get my hands dirty building a recommender system in Python, and came across this gem of a blog post. This notebook is where I keep my own notes as I work through that tutorial.
In [1]:
import numpy as np; import pandas as pd
In [3]:
# !curl -O http://files.grouplens.org/datasets/movielens/ml-100k.zip
# !unzip ml-100k.zip
In [5]:
# Column labels are passed explicitly through `names`; the file is read as
# tab-separated text.
names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv(
    'data/ml-100k/u.data',
    names=names,
    sep='\t',
)
df.head()
Out[5]:
In [7]:
# Number of distinct user ids and item ids that occur in the ratings table.
n_users = len(df['user_id'].unique())
n_items = len(df['item_id'].unique())
print('{} users\n{} items'.format(n_users, n_items))
In [8]:
# Dense user x item matrix, initialised to 0 for every (user, item) pair.
ratings = np.zeros((n_users, n_items))
# Ids are shifted by -1 to become 0-based row/column positions. All observed
# ratings are written in a single vectorized assignment instead of a Python
# loop over DataFrame rows; if a (user, item) pair occurs more than once the
# last occurrence wins, exactly as it did with the loop.
ratings[df['user_id'].values - 1, df['item_id'].values - 1] = df['rating'].values
ratings
Out[8]:
In [10]:
# Share of matrix cells that hold a nonzero rating, expressed in percent.
sparsity = float(np.count_nonzero(ratings)) / ratings.size * 100
print('Sparsity: {:4.2f}%'.format(sparsity))
In [13]:
def train_test_split(ratings, size=10, seed=None):
    """Hold out a fixed number of ratings per user to build a test matrix.

    For every row (user) of ``ratings``, ``size`` nonzero entries are drawn
    without replacement. They are zeroed in the training copy and copied into
    the otherwise all-zero test matrix.

    Parameters
    ----------
    ratings : np.ndarray
        2-D user x item matrix; nonzero entries are treated as ratings.
        The input is not modified.
    size : int, optional
        Number of ratings to hold out per user (default 10).
    seed : int or None, optional
        When given, a private ``RandomState(seed)`` is used so the split is
        reproducible. When ``None`` (default) NumPy's global RNG is used.

    Returns
    -------
    (train, test) : tuple of np.ndarray
        Two matrices shaped like ``ratings`` whose nonzero entries never
        overlap.

    Raises
    ------
    ValueError
        If some user has fewer than ``size`` nonzero ratings.
    """
    # Both np.random and RandomState expose .choice, so one code path serves
    # the seeded and the unseeded case.
    rng = np.random if seed is None else np.random.RandomState(seed)
    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for user in range(ratings.shape[0]):
        rated_items = ratings[user, :].nonzero()[0]
        if rated_items.size < size:
            raise ValueError(
                'user row {} has only {} ratings; cannot hold out {}'.format(
                    user, rated_items.size, size))
        test_ratings = rng.choice(rated_items, size=size, replace=False)
        train[user, test_ratings] = 0.
        test[user, test_ratings] = ratings[user, test_ratings]
    # Test and training are truly disjoint
    assert np.all((train * test) == 0)
    return train, test
In [14]:
# Seed NumPy's global RNG before the random hold-out so the split, and every
# error figure derived from it, is the same after Restart & Run All.
np.random.seed(0)
train, test = train_test_split(ratings)
In [15]:
def fast_similarity(ratings, kind='user', epsilon=1e-9):
    """Cosine similarity between all pairs of users or all pairs of items.

    Parameters
    ----------
    ratings : np.ndarray
        2-D user x item matrix.
    kind : {'user', 'item'}
        ``'user'`` compares rows, ``'item'`` compares columns.
    epsilon : float, optional
        Small constant added to every dot product so that an all-zero
        row/column does not cause a division by zero.

    Returns
    -------
    np.ndarray
        Square matrix, users x users or items x items depending on ``kind``.

    Raises
    ------
    ValueError
        If ``kind`` is neither ``'user'`` nor ``'item'``.
    """
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    else:
        raise ValueError("kind must be 'user' or 'item', got {!r}".format(kind))
    # The diagonal holds each vector's squared norm (plus epsilon).
    norms = np.sqrt(np.diagonal(sim))
    # Divide column-wise, then row-wise, by the norms.
    return sim / norms[np.newaxis, :] / norms[:, np.newaxis]
In [16]:
# Time the user-user similarity computation on the training matrix.
%timeit fast_similarity(train, kind='user')
In [18]:
# Build both similarity matrices from the training ratings only.
user_similarity, item_similarity = (
    fast_similarity(train, kind=axis) for axis in ('user', 'item')
)
# Peek at the top-left corner of the item-item matrix.
print(item_similarity[:4, :4])
In [19]:
def predict_fast_simple(ratings, similarity, kind='user'):
    """Predict every rating as a similarity-weighted average of all ratings.

    Parameters
    ----------
    ratings : np.ndarray
        2-D user x item matrix.
    similarity : np.ndarray
        Square similarity matrix: users x users for ``kind='user'``,
        items x items for ``kind='item'``.
    kind : {'user', 'item'}
        Which neighbourhood the weights come from.

    Returns
    -------
    np.ndarray
        Predicted ratings, same shape as ``ratings``.

    Raises
    ------
    ValueError
        If ``kind`` is neither ``'user'`` nor ``'item'``.
    """
    # Normaliser: sum of absolute similarities along each row of `similarity`.
    weight_totals = np.abs(similarity).sum(axis=1)
    if kind == 'user':
        # One normaliser per user (per output row).
        return similarity.dot(ratings) / weight_totals[:, np.newaxis]
    elif kind == 'item':
        # One normaliser per item (per output column). Row sums are used, which
        # equal the column sums when `similarity` is symmetric.
        return ratings.dot(similarity) / weight_totals[np.newaxis, :]
    raise ValueError("kind must be 'user' or 'item', got {!r}".format(kind))
In [20]:
# Time the user-based prediction over the whole training matrix.
%timeit predict_fast_simple(train, user_similarity, kind='user')
In [25]:
from sklearn.metrics import mean_squared_error
def get_mse(pred, actual):
    """Mean squared error over the entries that are nonzero in ``actual``.

    Parameters
    ----------
    pred : np.ndarray
        Predicted ratings, same shape as ``actual``.
    actual : np.ndarray
        Reference ratings; entries equal to 0 are excluded from the error.

    Returns
    -------
    float
        MSE between ``pred`` and ``actual`` at the nonzero positions of
        ``actual``.
    """
    # Keep only the positions where `actual` is nonzero; zero entries are
    # ignored. The mask is computed once and shared by both arrays.
    observed = actual.nonzero()
    # scikit-learn's documented argument order is (y_true, y_pred).
    return mean_squared_error(actual[observed], pred[observed])
In [26]:
# Predict with both neighbourhood types from the training matrix ...
user_prediction = predict_fast_simple(train, user_similarity, kind='user')
item_prediction = predict_fast_simple(train, item_similarity, kind='item')
# ... and score each prediction against the held-out test ratings.
print('User-based CF MSE: ' + str(get_mse(user_prediction, test)))
print('Item-based CF MSE: ' + str(get_mse(item_prediction, test)))
In [30]:
def predict_topk(ratings, similarity, kind='user', k=40):
    """Predict ratings from only the k most similar users or items.

    Parameters
    ----------
    ratings : np.ndarray
        2-D user x item matrix.
    similarity : np.ndarray
        Square similarity matrix: users x users for ``kind='user'``,
        items x items for ``kind='item'``.
    kind : {'user', 'item'}
        Which neighbourhood to use.
    k : int, optional
        Number of most-similar neighbours to average over (default 40).
        If ``k`` exceeds the number of candidates, all of them are used.

    Returns
    -------
    np.ndarray
        Predicted ratings, same shape as ``ratings``.

    Raises
    ------
    ValueError
        If ``kind`` is neither ``'user'`` nor ``'item'``.
    """
    if kind not in ('user', 'item'):
        raise ValueError("kind must be 'user' or 'item', got {!r}".format(kind))
    pred = np.zeros(ratings.shape)
    if kind == 'user':
        for i in range(ratings.shape[0]):
            # Indices of the k largest similarities, most similar first.
            # The index must stay a plain 1-D array: wrapping it in a list
            # makes NumPy treat it as a 2-D index and breaks the dot product.
            top_k_users = np.argsort(similarity[:, i])[:-k-1:-1]
            weights = similarity[i, top_k_users]
            # Weighted average over the neighbours for all items at once.
            pred[i, :] = weights.dot(ratings[top_k_users, :]) / np.sum(np.abs(weights))
    else:
        for j in range(ratings.shape[1]):
            top_k_items = np.argsort(similarity[:, j])[:-k-1:-1]
            weights = similarity[j, top_k_items]
            # Weighted average over the neighbours for all users at once.
            pred[:, j] = ratings[:, top_k_items].dot(weights) / np.sum(np.abs(weights))
    return pred
In [31]:
# Score the top-k predictors (k=40) on the held-out test ratings.
# `pred` is reused: the second assignment overwrites the user-based result,
# so each print must stay directly after its own prediction.
pred = predict_topk(train, user_similarity, kind='user', k=40)
print('Top-k User-based CF MSE: ' + str(get_mse(pred, test)))
pred = predict_topk(train, item_similarity, kind='item', k=40)
print('Top-k Item-based CF MSE: ' + str(get_mse(pred, test)))
In [32]:
# Neighbourhood sizes to evaluate.
k_array = [5, 15, 30, 50, 100, 200]
# One MSE per entry of k_array, filled in the same order by the loop below.
user_train_mse = []
user_test_mse = []
item_test_mse = []
item_train_mse = []

# get_mse comes from the earlier evaluation cell; it is not redefined here so
# the notebook has a single definition of it.
for k in k_array:
    user_pred = predict_topk(train, user_similarity, kind='user', k=k)
    item_pred = predict_topk(train, item_similarity, kind='item', k=k)

    # Train error is measured on the ratings the model saw, test error on the
    # held-out ones.
    user_train_mse.append(get_mse(user_pred, train))
    user_test_mse.append(get_mse(user_pred, test))

    item_train_mse.append(get_mse(item_pred, train))
    item_test_mse.append(get_mse(item_pred, test))
In [34]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
# Two colours: one per neighbourhood type. Train curves are drawn
# semi-transparent and test curves opaque, in the same colour.
pal = sns.color_palette("Set2", 2)

# Explicit Figure/Axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(k_array, user_train_mse, c=pal[0], label='User-based train', alpha=0.5, linewidth=5)
ax.plot(k_array, user_test_mse, c=pal[0], label='User-based test', linewidth=5)
ax.plot(k_array, item_train_mse, c=pal[1], label='Item-based train', alpha=0.5, linewidth=5)
ax.plot(k_array, item_test_mse, c=pal[1], label='Item-based test', linewidth=5)
ax.legend(loc='best', fontsize=20)
ax.tick_params(axis='both', labelsize=16)
ax.set_xlabel('k', fontsize=30)
ax.set_ylabel('MSE', fontsize=30);